In [3]:
from genpeds import Admissions, Enrollment, Graduation
import numpy as np
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = 'notebook'
General Trends¶
In [4]:
# Pull each genpeds dataset for the years of interest. merge_with_char=True is
# passed to every run() call (presumably merges institutional characteristics
# into the result -- confirm against the genpeds documentation).
admit_df = Admissions([2003, 2013, 2023]).run(merge_with_char=True)
enroll_df = Enrollment([1993, 2003, 2013, 2023]).run(
    merge_with_char=True,
    student_level='undergrad',
)
grad_df = Graduation([2003, 2013, 2023]).run(
    merge_with_char=True,
    degree_level='bach',
)
In [5]:
# Report how many 2023 rows each dataset contains.
# The count is computed outside the f-string: indexing with df['year'] inside a
# single-quoted f-string re-uses the enclosing quote character, which is a
# SyntaxError on Python versions before 3.12.
for cat,df in [
    ('admissions',admit_df),
    ('enrollment',enroll_df),
    ('graduation',grad_df)
]:
    n_obs_2023 = int((df['year'] == 2023).sum())
    print(f'2023 Observations ({cat}): {n_obs_2023}')
2023 Observations (admissions): 1972 2023 Observations (enrollment): 5647 2023 Observations (graduation): 2004
Empirical PDF and CDF¶
In [6]:
def plot_empirical_PMFandCDF(df,var,year=2023):
    """Show the empirical CDF and a probability-normalised histogram of `var`.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to plot. The function does not filter by year itself; pass a
        frame that is already restricted to the period of interest.
    var : str
        Column of `df` to plot. Null values are dropped and the remaining
        values are rounded to the nearest integer before plotting.
    year : int or str, default 2023
        Label shown in the subplot titles. It only affects the titles, so the
        default reproduces the previous hard-coded "(2023)" text.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Sorted, rounded, non-null observations form the x-values of the CDF.
    dat = np.sort(df.loc[df[var].notnull(),var].round(0))
    # Empirical CDF: the i-th smallest observation sits at cumulative share i/n.
    cdf = np.arange(1,len(dat)+1) / len(dat)

    fig = make_subplots(
        1,2,
        subplot_titles=(f'CDF of {var} ({year})',f'Histogram of {var} ({year})')
    )
    fig.add_trace(go.Scatter(
        x=dat,
        y=cdf,
        mode='lines'
    ),row=1,col=1)
    # histnorm='probability' makes bar heights sum to 1 instead of showing counts.
    fig.add_trace(go.Histogram(
        x=dat,histnorm='probability',marker_color='lightblue'
    ),row=1,col=2)
    fig.update_layout(
        showlegend=False,
        width=1000, height=400,
        template='plotly_dark',
        margin={'pad': 0, 'b': 20, 'l': 20, 'r': 20, 't': 50}
    )
    fig.show()
In [7]:
# admissions: distribution of the male acceptance rate in 2023
plot_empirical_PMFandCDF(
    df=admit_df.query('year==2023'),
    var='accept_rate_men',
)
In [8]:
# enrollment: distribution of the male enrollment share in 2023
plot_empirical_PMFandCDF(
    df=enroll_df.query('year==2023'),
    var='totmen_share',
)
In [9]:
# graduation: distribution of the male graduation rate in 2023
plot_empirical_PMFandCDF(
    df=grad_df.query('year==2023'),
    var='gradrate_totmen',
)
Time Differences¶
In [10]:
# plot time differences
def plot_time_differences(df,yr1,yr2,var):
    """Scatter each institution's `var` in `yr1` (x) against `yr2` (y).

    Rows are matched across the two years on the ``id`` column; only ids with
    a non-null `var` in both years are plotted. A 45-degree reference line is
    drawn so that points above it are institutions whose `var` increased.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``year``, ``id`` and `var`.
    yr1, yr2 : int
        The earlier and later year to compare.
    var : str
        Column to compare between the two years.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    has_value = df[var].notnull()
    df1 = df.loc[(df['year']==yr1) & has_value]
    df2 = df.loc[(df['year']==yr2) & has_value]
    # Inner merge keeps only ids present in both years; every other shared
    # column (including `var`) receives a year suffix.
    df_merged = pd.merge(df1,df2,on=['id'],suffixes=[f'_{yr1}',f'_{yr2}'])
    col1 = f'{var}_{yr1}'
    col2 = f'{var}_{yr2}'

    # Compare the columns directly rather than through a query string so the
    # column name need not be a valid Python identifier, and guard against an
    # empty merge, which previously raised ZeroDivisionError.
    if len(df_merged) > 0:
        share_increased = float((df_merged[col1] < df_merged[col2]).mean())
    else:
        share_increased = float('nan')

    # Reference line y = x over 0..100 (assumes `var` is on a 0-100 scale --
    # TODO confirm for any new variable passed in).
    line = np.arange(0,101)

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=df_merged[col1],
        y=df_merged[col2],
        mode='markers',
        marker_color='pink',
        marker_opacity=.5
    ))
    fig.add_trace(go.Scatter(
        x=line,
        y=line,
        mode='lines',
        marker_color='white'
    ))
    fig.update_layout(
        title={'text': f'{yr1} (x-axis) and {yr2} (y-axis): Points above line mean an increase in {var}',
               'subtitle': {'text': f'Share of schools that INCREASED {var}: <b>{round(share_increased,2)}'}},
        xaxis={'title':{'text': f'{var} ({yr1})'}}, yaxis={'title':{'text': f'{var} ({yr2})'}},
        showlegend=False,
        width=1000, height=400,
        template='plotly_dark',
        margin={'pad': 0, 'b': 20, 'l': 20, 'r': 20, 't': 50}
    )
    fig.show()
In [11]:
# admissions: male acceptance rate, 2003 vs 2023
plot_time_differences(
    df=admit_df,
    yr1=2003,
    yr2=2023,
    var='accept_rate_men',
)
In [12]:
# enrollment: male enrollment share, 1993 vs 2023
plot_time_differences(
    df=enroll_df,
    yr1=1993,
    yr2=2023,
    var='totmen_share',
)
In [13]:
# graduation: male graduation rate, 2003 vs 2023
plot_time_differences(
    df=grad_df,
    yr1=2003,
    yr2=2023,
    var='gradrate_totmen',
)
In [14]:
def wtd_quantile(df,var,weight_var,quantile):
    """Return the weighted `quantile` of column `var`, weighted by `weight_var`.

    Rows where either column is null are ignored. The remaining values are
    sorted, their weights are accumulated, and the first value whose
    cumulative weight reaches ``quantile * total_weight`` is returned (no
    interpolation between observations).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `var` and `weight_var`.
    var : str
        Column holding the values.
    weight_var : str
        Column holding the weights.
    quantile : float
        Quantile to compute, between 0 and 1 inclusive.

    Returns
    -------
    scalar
        An element of ``df[var]``, or ``numpy.nan`` when no row has both a
        value and a weight.

    Raises
    ------
    ValueError
        If `quantile` is outside [0, 1] (including NaN).
    """
    if not 0 <= quantile <= 1:
        raise ValueError(f'quantile must be between 0 and 1, got {quantile!r}')

    valid = df.loc[df[var].notnull() & df[weight_var].notnull()]
    if valid.empty:
        # Nothing to rank; indexing into an empty array would raise IndexError.
        return np.nan

    var_arr = valid[var].to_numpy()
    weight_arr = valid[weight_var].to_numpy()

    srt_idx = np.argsort(var_arr)
    srt_dat = var_arr[srt_idx]
    cum_wt = np.cumsum(weight_arr[srt_idx])

    # Take the total from the cumulative sum itself: np.sum can differ from
    # cum_wt[-1] in the last floating-point bit, which could push the cutoff
    # past every cumulative weight when quantile == 1.
    cutoff = cum_wt[-1] * quantile
    cutoff_idx = np.searchsorted(cum_wt,cutoff,side='left')
    # Defensive clamp so the lookup can never run off the end of the array.
    cutoff_idx = min(int(cutoff_idx), len(srt_dat) - 1)
    return srt_dat[cutoff_idx]
def wtd_iqr(df,var,weight_var):
    """Return the weighted 25th, 50th and 75th percentiles of `var` as a tuple.

    Each percentile is computed by ``wtd_quantile`` using `weight_var` as the
    weight column; the result is ordered (lower quartile, median, upper quartile).
    """
    quartile_levels = ((1/4), (2/4), (3/4))
    return tuple(
        wtd_quantile(df,var,weight_var,level)
        for level in quartile_levels
    )
In [15]:
# Weighted median of each headline variable, by year. `totqry` is an eval
# expression that creates the `tot` column used as the weight.
for df,totqry,var,ttl in [(admit_df,'tot = tot_enrolled','accept_rate_men','Male Acceptance Rate'),
                          (enroll_df,'tot = totmen + totwomen','totmen_share', 'Male Enrollment Share'),
                          (grad_df,'tot = totmen + totwomen','gradrate_totmen','Male Graduation Rate'),
                          (grad_df,'tot = totmen + totwomen','gradrate_totwomen','Female Graduation Rate')]:
    print(f'Weighted Medians of {ttl}:')
    # Only the enrollment data was requested for 1993. Decide this once, before
    # any derived frame exists, so the identity check stays valid for every
    # year instead of relying on 1993 being the first year in the list.
    has_1993 = df is enroll_df
    # Adding the weight column does not depend on the year, so do it once.
    df_weighted = df.eval(totqry)
    for yr in [1993,2003,2013,2023]:
        if yr == 1993 and not has_1993:
            continue
        val = wtd_quantile(df=df_weighted.query(f'year=={yr}'),var=var,weight_var='tot',quantile=.5)
        print(f'{(yr)}: {round(val,2)}')
    print()
Weighted Medians of Male Acceptance Rate: 2003: 71.13 2013: 65.09 2023: 74.01 Weighted Medians of Male Enrollment Share: 1993: 43.81 2003: 42.4 2013: 43.72 2023: 43.15 Weighted Medians of Male Graduation Rate: 2003: 51.66 2013: 55.75 2023: 60.92 Weighted Medians of Female Graduation Rate: 2003: 59.14 2013: 62.75 2023: 69.01
In [16]:
# Weighted quartiles of each headline variable for 2023. `totqry` is an eval
# expression that creates the `tot` column used as the weight.
for df,totqry,var,ttl in [
    (admit_df,'tot = tot_enrolled','accept_rate_men','Male Acceptance Rate'),
    (enroll_df,'tot = totmen + totwomen','totmen_share', 'Male Enrollment Share'),
    (grad_df,'tot = totmen + totwomen','gradrate_totmen', 'Male Graduation Rate'),
    (grad_df,'tot = totmen + totwomen','gradrate_totwomen', 'Female Graduation Rate')
]:
    print(f'Interquartile Range (weighted) (2023) of {ttl}:')
    # Add the weight column, then keep only the 2023 rows.
    df_2023 = df.eval(totqry).query('year==2023')
    lower_q,median_q,upper_q = wtd_iqr(df_2023,var,'tot')
    print(f'25th: {round(lower_q,2)}')
    print(f'50th: {round(median_q,2)}')
    print(f'75th: {round(upper_q,2)}\n')
Interquartile Range (weighted) (2023) of Male Acceptance Rate: 25th: 53.42 50th: 74.01 75th: 86.14 Interquartile Range (weighted) (2023) of Male Enrollment Share: 25th: 38.84 50th: 43.15 75th: 46.88 Interquartile Range (weighted) (2023) of Male Graduation Rate: 25th: 47.38 50th: 60.92 75th: 75.5 Interquartile Range (weighted) (2023) of Female Graduation Rate: 25th: 57.01 50th: 69.01 75th: 81.72